quant.skyline.rda, which is output of dataProcess from section 2dataProcess in MSstatsdataProcessload('./data_day3/quant.skyline.rda')
It should use the output of quantification function in MSstats in order to get subject quantification for each protein. In this ABRF study, there is no biological replicate. Let’s use run-level quantification for protein instead as an example.
head(quant.skyline$RunlevelData)
## RUN Protein LogIntensities NumMeasuredFeature
## 1 1 sp|D6VTK4|STE2_YEAST 26.81232 8
## 2 2 sp|D6VTK4|STE2_YEAST 26.60786 8
## 3 3 sp|D6VTK4|STE2_YEAST 26.58301 8
## 4 4 sp|D6VTK4|STE2_YEAST 26.83563 8
## 5 5 sp|D6VTK4|STE2_YEAST 26.79430 8
## 6 6 sp|D6VTK4|STE2_YEAST 26.60863 8
## MissingPercentage more50missing NumImputedFeature
## 1 0 FALSE 0
## 2 0 FALSE 0
## 3 0 FALSE 0
## 4 0 FALSE 0
## 5 0 FALSE 0
## 6 0 FALSE 0
## originalRUN GROUP GROUP_ORIGINAL SUBJECT_ORIGINAL
## 1 JD_06232014_sample1_B.raw 1 Condition1 1
## 2 JD_06232014_sample1_C.raw 1 Condition1 1
## 3 JD_06232014_sample1-A.raw 1 Condition1 1
## 4 JD_06232014_sample2_A.raw 2 Condition2 2
## 5 JD_06232014_sample2_B.raw 2 Condition2 2
## 6 JD_06232014_sample2_C.raw 2 Condition2 2
## SUBJECT_NESTED SUBJECT
## 1 1.1 1
## 2 1.1 1
## 3 1.1 1
## 4 2.2 2
## 5 2.2 2
## 6 2.2 2
runquant <- quant.skyline$RunlevelData
head(runquant)
## RUN Protein LogIntensities NumMeasuredFeature
## 1 1 sp|D6VTK4|STE2_YEAST 26.81232 8
## 2 2 sp|D6VTK4|STE2_YEAST 26.60786 8
## 3 3 sp|D6VTK4|STE2_YEAST 26.58301 8
## 4 4 sp|D6VTK4|STE2_YEAST 26.83563 8
## 5 5 sp|D6VTK4|STE2_YEAST 26.79430 8
## 6 6 sp|D6VTK4|STE2_YEAST 26.60863 8
## MissingPercentage more50missing NumImputedFeature
## 1 0 FALSE 0
## 2 0 FALSE 0
## 3 0 FALSE 0
## 4 0 FALSE 0
## 5 0 FALSE 0
## 6 0 FALSE 0
## originalRUN GROUP GROUP_ORIGINAL SUBJECT_ORIGINAL
## 1 JD_06232014_sample1_B.raw 1 Condition1 1
## 2 JD_06232014_sample1_C.raw 1 Condition1 1
## 3 JD_06232014_sample1-A.raw 1 Condition1 1
## 4 JD_06232014_sample2_A.raw 2 Condition2 2
## 5 JD_06232014_sample2_B.raw 2 Condition2 2
## 6 JD_06232014_sample2_C.raw 2 Condition2 2
## SUBJECT_NESTED SUBJECT
## 1 1.1 1
## 2 1.1 1
## 3 1.1 1
## 4 2.2 2
## 5 2.2 2
## 6 2.2 2
Then let’s make wide format of data from long format.
library(reshape2)
input.wide <- dcast(Protein ~ originalRUN, data=runquant, value.var = 'LogIntensities')
head(input.wide)
## Protein JD_06232014_sample1_B.raw
## 1 sp|D6VTK4|STE2_YEAST 26.81232
## 2 sp|O13297|CET1_YEAST 24.71912
## 3 sp|O13329|FOB1_YEAST 23.37678
## 4 sp|O13539|THP2_YEAST 27.52021
## 5 sp|O13547|CCW14_YEAST 27.22234
## 6 sp|O13563|RPN13_YEAST 26.09476
## JD_06232014_sample1_C.raw JD_06232014_sample1-A.raw
## 1 26.60786 26.58301
## 2 24.67437 24.71809
## 3 24.01464 23.47075
## 4 27.38510 24.29661
## 5 26.80454 27.11638
## 6 26.29253 26.17056
## JD_06232014_sample2_A.raw JD_06232014_sample2_B.raw
## 1 26.83563 26.79430
## 2 24.52341 24.67593
## 3 23.17165 23.43427
## 4 25.88066 26.10254
## 5 27.17386 26.91302
## 6 25.90882 26.12989
## JD_06232014_sample2_C.raw JD_06232014_sample3_A.raw
## 1 26.60863 26.62966
## 2 24.57865 24.38955
## 3 23.76997 24.20100
## 4 25.90646 25.65002
## 5 26.88314 26.56416
## 6 26.01078 26.17791
## JD_06232014_sample3_B.raw JD_06232014_sample3_C.raw
## 1 26.49626 26.53029
## 2 24.62652 24.66197
## 3 23.28643 23.73741
## 4 26.33853 25.91799
## 5 26.75541 26.95027
## 6 26.03753 26.11412
## JD_06232014_sample4_B.raw JD_06232014_sample4_C.raw
## 1 26.60612 26.38611
## 2 24.64886 24.66559
## 3 23.16646 23.60780
## 4 26.70101 25.91781
## 5 26.98082 26.80274
## 6 26.07538 26.05415
## JD_06232014_sample4-A.raw
## 1 26.65573
## 2 24.50814
## 3 23.03473
## 4 25.07576
## 5 27.07526
## 6 25.77958
rownames(input.wide) <- input.wide$Protein
input.wide <- input.wide[, -1]
head(input.wide)
## JD_06232014_sample1_B.raw JD_06232014_sample1_C.raw
## sp|D6VTK4|STE2_YEAST 26.81232 26.60786
## sp|O13297|CET1_YEAST 24.71912 24.67437
## sp|O13329|FOB1_YEAST 23.37678 24.01464
## sp|O13539|THP2_YEAST 27.52021 27.38510
## sp|O13547|CCW14_YEAST 27.22234 26.80454
## sp|O13563|RPN13_YEAST 26.09476 26.29253
## JD_06232014_sample1-A.raw JD_06232014_sample2_A.raw
## sp|D6VTK4|STE2_YEAST 26.58301 26.83563
## sp|O13297|CET1_YEAST 24.71809 24.52341
## sp|O13329|FOB1_YEAST 23.47075 23.17165
## sp|O13539|THP2_YEAST 24.29661 25.88066
## sp|O13547|CCW14_YEAST 27.11638 27.17386
## sp|O13563|RPN13_YEAST 26.17056 25.90882
## JD_06232014_sample2_B.raw JD_06232014_sample2_C.raw
## sp|D6VTK4|STE2_YEAST 26.79430 26.60863
## sp|O13297|CET1_YEAST 24.67593 24.57865
## sp|O13329|FOB1_YEAST 23.43427 23.76997
## sp|O13539|THP2_YEAST 26.10254 25.90646
## sp|O13547|CCW14_YEAST 26.91302 26.88314
## sp|O13563|RPN13_YEAST 26.12989 26.01078
## JD_06232014_sample3_A.raw JD_06232014_sample3_B.raw
## sp|D6VTK4|STE2_YEAST 26.62966 26.49626
## sp|O13297|CET1_YEAST 24.38955 24.62652
## sp|O13329|FOB1_YEAST 24.20100 23.28643
## sp|O13539|THP2_YEAST 25.65002 26.33853
## sp|O13547|CCW14_YEAST 26.56416 26.75541
## sp|O13563|RPN13_YEAST 26.17791 26.03753
## JD_06232014_sample3_C.raw JD_06232014_sample4_B.raw
## sp|D6VTK4|STE2_YEAST 26.53029 26.60612
## sp|O13297|CET1_YEAST 24.66197 24.64886
## sp|O13329|FOB1_YEAST 23.73741 23.16646
## sp|O13539|THP2_YEAST 25.91799 26.70101
## sp|O13547|CCW14_YEAST 26.95027 26.98082
## sp|O13563|RPN13_YEAST 26.11412 26.07538
## JD_06232014_sample4_C.raw JD_06232014_sample4-A.raw
## sp|D6VTK4|STE2_YEAST 26.38611 26.65573
## sp|O13297|CET1_YEAST 24.66559 24.50814
## sp|O13329|FOB1_YEAST 23.60780 23.03473
## sp|O13539|THP2_YEAST 25.91781 25.07576
## sp|O13547|CCW14_YEAST 26.80274 27.07526
## sp|O13563|RPN13_YEAST 26.05415 25.77958
dim(input.wide) # there are 3027 rows (one row per protein) and 12 columns (each colomn for run)
## [1] 3027 12
Please check whether there are missing values (NAs) or not. If there is no intensity at all in certain run for certain protein, we can’t get run-summarized for that run.
sum(is.na(input.wide))
## [1] 3
There are three NAs. Then, we need to decide how to deal with NAs. * First option: remove the samples with missing values * Second option: impute the missing values. Then need to how to impute. 1) impute with zero, 2) with minimum value per protein or among all proteins, 3) with median or mean, 4) random sampling among other run-summarized value per protein.
In this case, let’s remove proteins with any NAs.
samplemissing <- apply(input.wide, 1, function(x) sum(is.na(x)))
unique(samplemissing)
## [1] 0 1 2
selectedSamples <- which(samplemissing == 0)
# Only keep the samples with no missing values
input.wide <- input.wide[selectedSamples, ]
dim(input.wide)
## [1] 3025 12
Two samples, which have any NAs, are removed.
Let’s make another data with only spike-in proteins.
input.wide.spike <- input.wide[rownames(input.wide) %in% c("sp|P44015|VAC2_YEAST",
"sp|P55752|ISCB_YEAST",
"sp|P44374|SFG2_YEAST",
"sp|P44983|UTR6_YEAST",
"sp|P44683|PGA4_YEAST",
"sp|P55249|ZRT4_YEAST"),]
input.wide.spike
## JD_06232014_sample1_B.raw JD_06232014_sample1_C.raw
## sp|P44015|VAC2_YEAST 26.30163 26.11643
## sp|P44374|SFG2_YEAST 24.26559 24.27609
## sp|P44683|PGA4_YEAST 22.47239 22.56104
## sp|P44983|UTR6_YEAST 21.29173 21.38781
## sp|P55249|ZRT4_YEAST 21.86421 21.41435
## sp|P55752|ISCB_YEAST 27.47412 27.69489
## JD_06232014_sample1-A.raw JD_06232014_sample2_A.raw
## sp|P44015|VAC2_YEAST 26.29089 25.81957
## sp|P44374|SFG2_YEAST 24.23382 21.36784
## sp|P44683|PGA4_YEAST 22.52561 20.06196
## sp|P44983|UTR6_YEAST 21.31825 27.51134
## sp|P55249|ZRT4_YEAST 21.18725 27.93246
## sp|P55752|ISCB_YEAST 27.34912 24.59934
## JD_06232014_sample2_B.raw JD_06232014_sample2_C.raw
## sp|P44015|VAC2_YEAST 26.11527 26.08498
## sp|P44374|SFG2_YEAST 21.20330 21.19631
## sp|P44683|PGA4_YEAST 20.18183 20.72974
## sp|P44983|UTR6_YEAST 27.46338 27.33788
## sp|P55249|ZRT4_YEAST 27.75780 27.83651
## sp|P55752|ISCB_YEAST 24.62946 24.70403
## JD_06232014_sample3_A.raw JD_06232014_sample3_B.raw
## sp|P44015|VAC2_YEAST 23.14806 23.32465
## sp|P44374|SFG2_YEAST 26.90000 26.85056
## sp|P44683|PGA4_YEAST 21.72482 21.93912
## sp|P44983|UTR6_YEAST 26.94886 26.92480
## sp|P55249|ZRT4_YEAST 21.04481 21.00220
## sp|P55752|ISCB_YEAST 21.37264 20.27802
## JD_06232014_sample3_C.raw JD_06232014_sample4_B.raw
## sp|P44015|VAC2_YEAST 23.29555 20.94536
## sp|P44374|SFG2_YEAST 26.90190 26.61875
## sp|P44683|PGA4_YEAST 21.96229 29.20591
## sp|P44983|UTR6_YEAST 26.85774 23.86204
## sp|P55249|ZRT4_YEAST 21.02761 20.71864
## sp|P55752|ISCB_YEAST 20.44112 27.36941
## JD_06232014_sample4_C.raw JD_06232014_sample4-A.raw
## sp|P44015|VAC2_YEAST 21.71424 20.25209
## sp|P44374|SFG2_YEAST 26.60961 26.69464
## sp|P44683|PGA4_YEAST 29.12515 29.21482
## sp|P44983|UTR6_YEAST 23.76070 24.01492
## sp|P55249|ZRT4_YEAST 20.69131 20.37886
## sp|P55752|ISCB_YEAST 27.53762 27.34758
I will save two wide format data.
save(input.wide.spike, file = 'input.wide.spike.rda')
save(input.wide.spike, file = 'input.wide.spike.rda')
prcomp functionFirst, let’s try PCA with 6 spike-in proteins only. Input has the row for proteins and the column for run (subject).
head(input.wide.spike)
## JD_06232014_sample1_B.raw JD_06232014_sample1_C.raw
## sp|P44015|VAC2_YEAST 26.30163 26.11643
## sp|P44374|SFG2_YEAST 24.26559 24.27609
## sp|P44683|PGA4_YEAST 22.47239 22.56104
## sp|P44983|UTR6_YEAST 21.29173 21.38781
## sp|P55249|ZRT4_YEAST 21.86421 21.41435
## sp|P55752|ISCB_YEAST 27.47412 27.69489
## JD_06232014_sample1-A.raw JD_06232014_sample2_A.raw
## sp|P44015|VAC2_YEAST 26.29089 25.81957
## sp|P44374|SFG2_YEAST 24.23382 21.36784
## sp|P44683|PGA4_YEAST 22.52561 20.06196
## sp|P44983|UTR6_YEAST 21.31825 27.51134
## sp|P55249|ZRT4_YEAST 21.18725 27.93246
## sp|P55752|ISCB_YEAST 27.34912 24.59934
## JD_06232014_sample2_B.raw JD_06232014_sample2_C.raw
## sp|P44015|VAC2_YEAST 26.11527 26.08498
## sp|P44374|SFG2_YEAST 21.20330 21.19631
## sp|P44683|PGA4_YEAST 20.18183 20.72974
## sp|P44983|UTR6_YEAST 27.46338 27.33788
## sp|P55249|ZRT4_YEAST 27.75780 27.83651
## sp|P55752|ISCB_YEAST 24.62946 24.70403
## JD_06232014_sample3_A.raw JD_06232014_sample3_B.raw
## sp|P44015|VAC2_YEAST 23.14806 23.32465
## sp|P44374|SFG2_YEAST 26.90000 26.85056
## sp|P44683|PGA4_YEAST 21.72482 21.93912
## sp|P44983|UTR6_YEAST 26.94886 26.92480
## sp|P55249|ZRT4_YEAST 21.04481 21.00220
## sp|P55752|ISCB_YEAST 21.37264 20.27802
## JD_06232014_sample3_C.raw JD_06232014_sample4_B.raw
## sp|P44015|VAC2_YEAST 23.29555 20.94536
## sp|P44374|SFG2_YEAST 26.90190 26.61875
## sp|P44683|PGA4_YEAST 21.96229 29.20591
## sp|P44983|UTR6_YEAST 26.85774 23.86204
## sp|P55249|ZRT4_YEAST 21.02761 20.71864
## sp|P55752|ISCB_YEAST 20.44112 27.36941
## JD_06232014_sample4_C.raw JD_06232014_sample4-A.raw
## sp|P44015|VAC2_YEAST 21.71424 20.25209
## sp|P44374|SFG2_YEAST 26.60961 26.69464
## sp|P44683|PGA4_YEAST 29.12515 29.21482
## sp|P44983|UTR6_YEAST 23.76070 24.01492
## sp|P55249|ZRT4_YEAST 20.69131 20.37886
## sp|P55752|ISCB_YEAST 27.53762 27.34758
pc <- prcomp(t(input.wide.spike))
# Inspect PCA object
summary(pc)
## Importance of components%s:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 5.3154 3.6198 2.5680 0.2862 0.16542 0.11815
## Proportion of Variance 0.5877 0.2726 0.1372 0.0017 0.00057 0.00029
## Cumulative Proportion 0.5877 0.8603 0.9974 0.9991 0.99971 1.00000
names(pc)
## [1] "sdev" "rotation" "center" "scale" "x"
Let’s check the proportion of explained variance. The first component has the largest variance. In this case, we need 3 components to capture most of the variation.
percent_var <- pc$sdev^2/sum(pc$sdev^2)
barplot(percent_var, xlab="Principle component", ylab="% of variance")
cum_var <- cumsum(pc$sdev^2/sum(pc$sdev^2))
barplot(cum_var, xlab="Principle component", ylab="Cumulative % of variance" )
Let’s visualize PC1 vs PC2 in scatterplot. ‘x’ include PC components for each subject.
head(pc$x)
## PC1 PC2 PC3 PC4
## JD_06232014_sample1_B.raw 0.5376666 3.932191 -3.115395 -0.06275524
## JD_06232014_sample1_C.raw 0.9065490 3.889990 -3.162756 0.16757835
## JD_06232014_sample1-A.raw 0.8629331 3.689371 -3.445799 -0.06053656
## JD_06232014_sample2_A.raw -7.3135082 1.192973 2.265224 0.25481904
## JD_06232014_sample2_B.raw -7.2865015 1.354552 2.135451 0.01480192
## JD_06232014_sample2_C.raw -6.9270993 1.512729 2.423839 -0.27208541
## PC5 PC6
## JD_06232014_sample1_B.raw -0.034407021 0.25849848
## JD_06232014_sample1_C.raw 0.060173234 -0.05855078
## JD_06232014_sample1-A.raw -0.071114248 -0.19237301
## JD_06232014_sample2_A.raw 0.003991858 0.14244137
## JD_06232014_sample2_B.raw 0.033699006 -0.10337342
## JD_06232014_sample2_C.raw -0.048023671 -0.05503116
library(ggplot2)
ggplot(aes(x=PC1, y=PC2), data=data.frame(pc$x))+
geom_point(size=4, alpha=0.5)+
theme_bw()
In order to distinguish group by colors or shape, create vector of condition labels variable called Condition. The order should be the same as column of input.
colnames(input.wide.spike)
## [1] "JD_06232014_sample1_B.raw" "JD_06232014_sample1_C.raw"
## [3] "JD_06232014_sample1-A.raw" "JD_06232014_sample2_A.raw"
## [5] "JD_06232014_sample2_B.raw" "JD_06232014_sample2_C.raw"
## [7] "JD_06232014_sample3_A.raw" "JD_06232014_sample3_B.raw"
## [9] "JD_06232014_sample3_C.raw" "JD_06232014_sample4_B.raw"
## [11] "JD_06232014_sample4_C.raw" "JD_06232014_sample4-A.raw"
Condition <- c(rep("Condition1", 3), rep("Condition2", 3),
rep("Condition3", 3), rep("Condition4", 3))
Condition
## [1] "Condition1" "Condition1" "Condition1" "Condition2" "Condition2"
## [6] "Condition2" "Condition3" "Condition3" "Condition3" "Condition4"
## [11] "Condition4" "Condition4"
# Create PC1 vs PC2 scatterplot with Condition colors
ggplot(aes(x=PC1, y=PC2, color=Group), data=data.frame(pc$x, Group=Condition))+
geom_point(size=4, alpha=0.5)+
theme_bw()
We can create PCA scatterplots with custom Condition labels and colors.
ggplot(aes(x=PC1, y=PC2, color=Group, shape=Group), data=data.frame(pc$x, Group=Condition))+
geom_point(size=4, alpha=0.5)+
theme_bw()
ggplot(aes(x=PC1, y=PC3, color=Group, shape=Group), data=data.frame(pc$x, Group=Condition))+
geom_point(size=4, alpha=0.5)+
theme_bw()
ggplot(aes(x=PC2, y=PC3, color=Group, shape=Group), data=data.frame(pc$x, Group=Condition))+
geom_point(size=4, alpha=0.5)+
theme_bw()
Let’s create plot of PC1 loadings. ‘rotation’ show the Protein loading for each component.
head(pc$rotation)
## PC1 PC2 PC3 PC4
## sp|P44015|VAC2_YEAST -0.3271751 0.3008252 -0.3804194 -0.668544685
## sp|P44374|SFG2_YEAST 0.3536458 -0.3705173 -0.2020736 0.009423747
## sp|P44683|PGA4_YEAST 0.6157778 0.0665221 0.5172269 -0.581893078
## sp|P44983|UTR6_YEAST -0.2864101 -0.4558387 0.4875131 0.093951595
## sp|P55249|ZRT4_YEAST -0.4990234 0.2270087 0.5359809 -0.116096974
## sp|P55752|ISCB_YEAST 0.2401119 0.7130736 0.1482540 0.438239928
## PC5 PC6
## sp|P44015|VAC2_YEAST 0.43898820 -0.13445862
## sp|P44374|SFG2_YEAST 0.54923192 0.62854860
## sp|P44683|PGA4_YEAST -0.05136461 -0.08735535
## sp|P44983|UTR6_YEAST 0.53571361 -0.42035144
## sp|P55249|ZRT4_YEAST -0.04695331 0.62966981
## sp|P55752|ISCB_YEAST 0.46238691 -0.07769923
ggplot(aes(x=Protein, xend=Protein, y=0, yend=PC1),
data=data.frame(pc$rotation, Protein=rownames(pc$rotation)))+
geom_segment()+
labs(y="PC1 loading")+
theme_bw()+
theme(axis.text.x = element_text(angle=90))
Let’s compare to loadings to p-values. loadings close to zero mean less contribution. Many proteins with small loadings have significant p-values.
load('./data_day3/Skyline.result.rda')
head(Skyline.result)
## Protein Label log2FC SE Tvalue DF
## 1 sp|D6VTK4|STE2_YEAST C2-C1 0.07846162 0.09648008 0.8132416 8
## 7 sp|O13297|CET1_YEAST C2-C1 -0.11119732 0.07749333 -1.4349278 8
## 13 sp|O13329|FOB1_YEAST C2-C1 -0.16209422 0.29090086 -0.5572146 8
## 19 sp|O13539|THP2_YEAST C2-C1 -0.43742107 0.82871351 -0.5278315 8
## 25 sp|O13547|CCW14_YEAST C2-C1 -0.05774773 0.14672405 -0.3935805 8
## 31 sp|O13563|RPN13_YEAST C2-C1 -0.16945187 0.09518591 -1.7802201 8
## pvalue adj.pvalue issue MissingPercentage ImputationPercentage
## 1 0.4396100 0.9991155 NA 0 0
## 7 0.1892233 0.9991155 NA 0 0
## 13 0.5926243 0.9991155 NA 0 0
## 19 0.6119387 0.9991155 NA 0 0
## 25 0.7041724 0.9991155 NA 0 0
## 31 0.1129126 0.9991155 NA 0 0
Skyline.result.spikein <- Skyline.result[Skyline.result$Protein %in% c("sp|P44015|VAC2_YEAST",
"sp|P55752|ISCB_YEAST",
"sp|P44374|SFG2_YEAST",
"sp|P44983|UTR6_YEAST",
"sp|P44683|PGA4_YEAST",
"sp|P55249|ZRT4_YEAST"), ]
Skyline.result.spikein
## Protein Label log2FC SE Tvalue DF
## 10549 sp|P44015|VAC2_YEAST C2-C1 -0.22970949 0.31123051 -0.7380687 8
## 10555 sp|P44374|SFG2_YEAST C2-C1 -3.00268709 0.04643043 -64.6706761 8
## 10561 sp|P44683|PGA4_YEAST C2-C1 -2.19517053 0.15722789 -13.9617123 8
## 10567 sp|P44983|UTR6_YEAST C2-C1 6.10493811 0.06963413 87.6716342 8
## 13279 sp|P55249|ZRT4_YEAST C2-C1 6.35365502 0.16454664 38.6130957 8
## 13285 sp|P55752|ISCB_YEAST C2-C1 -2.86176857 0.25597097 -11.1800512 8
## 10550 sp|P44015|VAC2_YEAST C3-C1 -2.98022772 0.31123051 -9.5756284 8
## 10556 sp|P44374|SFG2_YEAST C3-C1 2.62564667 0.04643043 56.5501301 8
## 10562 sp|P44683|PGA4_YEAST C3-C1 -0.64427217 0.15722789 -4.0976965 8
## 10568 sp|P44983|UTR6_YEAST C3-C1 5.57787150 0.06963413 80.1025498 8
## 13280 sp|P55249|ZRT4_YEAST C3-C1 -0.46373228 0.16454664 -2.8182422 8
## 13286 sp|P55752|ISCB_YEAST C3-C1 -6.80878285 0.25597097 -26.5998242 8
## 10551 sp|P44015|VAC2_YEAST C4-C1 -5.26575535 0.31123051 -16.9191488 8
## 10557 sp|P44374|SFG2_YEAST C4-C1 2.38249477 0.04643043 51.3132216 8
## 10563 sp|P44683|PGA4_YEAST C4-C1 6.66227860 0.15722789 42.3733902 8
## 10569 sp|P44983|UTR6_YEAST C4-C1 2.54661911 0.06963413 36.5714206 8
## 13281 sp|P55249|ZRT4_YEAST C4-C1 -0.89233327 0.16454664 -5.4229809 8
## 13287 sp|P55752|ISCB_YEAST C4-C1 -0.08784197 0.25597097 -0.3431716 8
## 10552 sp|P44015|VAC2_YEAST C3-C2 -2.75051824 0.31123051 -8.8375597 8
## 10558 sp|P44374|SFG2_YEAST C3-C2 5.62833376 0.04643043 121.2208062 8
## 10564 sp|P44683|PGA4_YEAST C3-C2 1.55089836 0.15722789 9.8640158 8
## 10570 sp|P44983|UTR6_YEAST C3-C2 -0.52706661 0.06963413 -7.5690843 8
## 13282 sp|P55249|ZRT4_YEAST C3-C2 -6.81738730 0.16454664 -41.4313380 8
## 13288 sp|P55752|ISCB_YEAST C3-C2 -3.94701428 0.25597097 -15.4197730 8
## 10553 sp|P44015|VAC2_YEAST C4-C2 -5.03604587 0.31123051 -16.1810801 8
## 10559 sp|P44374|SFG2_YEAST C4-C2 5.38518186 0.04643043 115.9838977 8
## 10565 sp|P44683|PGA4_YEAST C4-C2 8.85744912 0.15722789 56.3351025 8
## 10571 sp|P44983|UTR6_YEAST C4-C2 -3.55831900 0.06963413 -51.1002136 8
## 13283 sp|P55249|ZRT4_YEAST C4-C2 -7.24598829 0.16454664 -44.0360766 8
## 13289 sp|P55752|ISCB_YEAST C4-C2 2.77392660 0.25597097 10.8368796 8
## 10554 sp|P44015|VAC2_YEAST C4-C3 -2.28552763 0.31123051 -7.3435204 8
## 10560 sp|P44374|SFG2_YEAST C4-C3 -0.24315190 0.04643043 -5.2369085 8
## 10566 sp|P44683|PGA4_YEAST C4-C3 7.30655076 0.15722789 46.4710867 8
## 10572 sp|P44983|UTR6_YEAST C4-C3 -3.03125238 0.06963413 -43.5311293 8
## 13284 sp|P55249|ZRT4_YEAST C4-C3 -0.42860099 0.16454664 -2.6047387 8
## 13290 sp|P55752|ISCB_YEAST C4-C3 6.72094088 0.25597097 26.2566526 8
## pvalue adj.pvalue issue MissingPercentage
## 10549 4.815597e-01 9.991155e-01 NA 0.000000000
## 10555 3.635536e-12 5.502384e-09 NA 0.000000000
## 10561 6.711258e-07 5.078744e-04 NA 0.006802721
## 10567 3.197442e-13 9.678658e-10 NA 0.004545455
## 13279 2.223177e-10 2.243186e-07 NA 0.000000000
## 13285 3.669632e-06 2.221595e-03 NA 0.000000000
## 10550 1.172208e-05 8.870685e-03 NA 0.000000000
## 10556 1.061307e-11 1.606288e-08 NA 0.000000000
## 10562 3.448716e-03 3.291921e-01 NA 0.005952381
## 10568 6.576961e-13 1.990846e-09 NA 0.004545455
## 13280 2.255486e-02 4.741219e-01 NA 0.003333333
## 13286 4.291489e-09 4.330112e-06 NA 0.000000000
## 10551 1.510378e-07 1.142978e-04 NA 0.000000000
## 10557 2.304823e-11 6.976699e-08 NA 0.000000000
## 10563 1.060525e-10 1.605105e-07 NA 0.002551020
## 10569 3.425775e-10 3.456607e-07 NA 0.010606061
## 13281 6.285656e-04 3.500760e-01 NA 0.000000000
## 13287 7.403131e-01 9.726249e-01 NA 0.000000000
## 10552 2.118219e-05 1.282370e-02 NA 0.000000000
## 10558 2.398082e-14 7.258993e-11 NA 0.000000000
## 10564 9.400880e-06 7.114116e-03 NA 0.007653061
## 10570 6.490596e-05 3.274506e-02 NA 0.000000000
## 13282 1.268576e-10 1.919990e-07 NA 0.003333333
## 13288 3.110579e-07 3.138574e-04 NA 0.000000000
## 10553 2.138408e-07 1.294592e-04 NA 0.000000000
## 10559 3.419487e-14 1.035079e-10 NA 0.000000000
## 10565 1.094080e-11 1.655891e-08 NA 0.004251701
## 10571 2.382627e-11 2.404071e-08 NA 0.006060606
## 13283 7.803913e-11 5.905611e-08 NA 0.000000000
## 13289 4.644012e-06 2.342904e-03 NA 0.000000000
## 10554 8.043885e-05 6.087210e-02 NA 0.000000000
## 10560 7.862402e-04 3.194668e-01 NA 0.000000000
## 10566 5.081291e-11 1.294836e-07 NA 0.003401361
## 10572 8.555245e-11 1.294836e-07 NA 0.006060606
## 13284 3.138583e-02 7.525620e-01 NA 0.003333333
## 13290 4.756235e-09 4.799042e-06 NA 0.000000000
## ImputationPercentage
## 10549 0
## 10555 0
## 10561 0
## 10567 0
## 13279 0
## 13285 0
## 10550 0
## 10556 0
## 10562 0
## 10568 0
## 13280 0
## 13286 0
## 10551 0
## 10557 0
## 10563 0
## 10569 0
## 13281 0
## 13287 0
## 10552 0
## 10558 0
## 10564 0
## 10570 0
## 13282 0
## 13288 0
## 10553 0
## 10559 0
## 10565 0
## 10571 0
## 13283 0
## 13289 0
## 10554 0
## 10560 0
## 10566 0
## 10572 0
## 13284 0
## 13290 0
smoothScatter(pc$rotation[,1], Skyline.result.spikein[Skyline.result.spikein$Label == 'C2-C1', 'adj.pvalue'])
smoothScatter(pc$rotation[,1], Skyline.result.spikein[Skyline.result.spikein$Label == 'C3-C1', 'adj.pvalue'])
prcomp functionLet’s repeat with all proteins (3025 proteins).
head(input.wide)
## JD_06232014_sample1_B.raw JD_06232014_sample1_C.raw
## sp|D6VTK4|STE2_YEAST 26.81232 26.60786
## sp|O13297|CET1_YEAST 24.71912 24.67437
## sp|O13329|FOB1_YEAST 23.37678 24.01464
## sp|O13539|THP2_YEAST 27.52021 27.38510
## sp|O13547|CCW14_YEAST 27.22234 26.80454
## sp|O13563|RPN13_YEAST 26.09476 26.29253
## JD_06232014_sample1-A.raw JD_06232014_sample2_A.raw
## sp|D6VTK4|STE2_YEAST 26.58301 26.83563
## sp|O13297|CET1_YEAST 24.71809 24.52341
## sp|O13329|FOB1_YEAST 23.47075 23.17165
## sp|O13539|THP2_YEAST 24.29661 25.88066
## sp|O13547|CCW14_YEAST 27.11638 27.17386
## sp|O13563|RPN13_YEAST 26.17056 25.90882
## JD_06232014_sample2_B.raw JD_06232014_sample2_C.raw
## sp|D6VTK4|STE2_YEAST 26.79430 26.60863
## sp|O13297|CET1_YEAST 24.67593 24.57865
## sp|O13329|FOB1_YEAST 23.43427 23.76997
## sp|O13539|THP2_YEAST 26.10254 25.90646
## sp|O13547|CCW14_YEAST 26.91302 26.88314
## sp|O13563|RPN13_YEAST 26.12989 26.01078
## JD_06232014_sample3_A.raw JD_06232014_sample3_B.raw
## sp|D6VTK4|STE2_YEAST 26.62966 26.49626
## sp|O13297|CET1_YEAST 24.38955 24.62652
## sp|O13329|FOB1_YEAST 24.20100 23.28643
## sp|O13539|THP2_YEAST 25.65002 26.33853
## sp|O13547|CCW14_YEAST 26.56416 26.75541
## sp|O13563|RPN13_YEAST 26.17791 26.03753
## JD_06232014_sample3_C.raw JD_06232014_sample4_B.raw
## sp|D6VTK4|STE2_YEAST 26.53029 26.60612
## sp|O13297|CET1_YEAST 24.66197 24.64886
## sp|O13329|FOB1_YEAST 23.73741 23.16646
## sp|O13539|THP2_YEAST 25.91799 26.70101
## sp|O13547|CCW14_YEAST 26.95027 26.98082
## sp|O13563|RPN13_YEAST 26.11412 26.07538
## JD_06232014_sample4_C.raw JD_06232014_sample4-A.raw
## sp|D6VTK4|STE2_YEAST 26.38611 26.65573
## sp|O13297|CET1_YEAST 24.66559 24.50814
## sp|O13329|FOB1_YEAST 23.60780 23.03473
## sp|O13539|THP2_YEAST 25.91781 25.07576
## sp|O13547|CCW14_YEAST 26.80274 27.07526
## sp|O13563|RPN13_YEAST 26.05415 25.77958
dim(input.wide)
## [1] 3025 12
pc <- prcomp(t(input.wide))
# Inspect PCA object
summary(pc)
## Importance of components%s:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 8.5615 8.0407 6.7776 6.3000 5.70469 5.0596 4.90438
## Proportion of Variance 0.1973 0.1740 0.1236 0.1068 0.08759 0.0689 0.06474
## Cumulative Proportion 0.1973 0.3713 0.4949 0.6018 0.68935 0.7582 0.82299
## PC8 PC9 PC10 PC11 PC12
## Standard deviation 4.6259 4.09531 3.90540 3.51317 5.935e-14
## Proportion of Variance 0.0576 0.04514 0.04105 0.03322 0.000e+00
## Cumulative Proportion 0.8806 0.92573 0.96678 1.00000 1.000e+00
names(pc)
## [1] "sdev" "rotation" "center" "scale" "x"
Let’s check the proportion of explained variance. The first component has the largest variance. In this case, we need 10-11 components to capture most of the variation.
percent_var <- pc$sdev^2/sum(pc$sdev^2)
barplot(percent_var, xlab="Principle component", ylab="% of variance")
cum_var <- cumsum(pc$sdev^2/sum(pc$sdev^2))
barplot(cum_var, xlab="Principle component", ylab="Cumulative % of variance" )
Let’s visualize PC1 vs PC2 in scatterplot. ‘x’ include PC components for each subject.
# 'x' include PC components for each subject.
head(pc$x)
## PC1 PC2 PC3 PC4
## JD_06232014_sample1_B.raw -1.917574 3.4164862 -10.015293 0.03980838
## JD_06232014_sample1_C.raw 5.225472 6.3347577 -4.182085 -0.74346099
## JD_06232014_sample1-A.raw -3.884473 1.6034036 -10.796357 -2.19975140
## JD_06232014_sample2_A.raw 1.197283 -17.3294296 -2.563257 8.54857860
## JD_06232014_sample2_B.raw 10.822842 0.1850169 -1.769981 2.69840938
## JD_06232014_sample2_C.raw 11.145497 -0.8122667 2.961406 8.15190088
## PC5 PC6 PC7 PC8
## JD_06232014_sample1_B.raw -2.13063047 0.1130047 -3.4227747 -0.71295527
## JD_06232014_sample1_C.raw 5.01570041 -5.2743242 -7.2867330 5.12845510
## JD_06232014_sample1-A.raw -0.05232091 2.0300503 -3.2690895 -4.12929066
## JD_06232014_sample2_A.raw 3.57323378 7.7025759 -0.7285404 -0.02193047
## JD_06232014_sample2_B.raw -3.58504139 -6.1833969 5.7342577 -9.05433684
## JD_06232014_sample2_C.raw -1.45746419 -4.0542433 2.4118556 7.20691739
## PC9 PC10 PC11 PC12
## JD_06232014_sample1_B.raw 5.18678879 -1.663498 -8.036395 5.787211e-14
## JD_06232014_sample1_C.raw -3.46516758 -5.352419 3.151916 5.632170e-14
## JD_06232014_sample1-A.raw -2.25243930 8.086692 3.826111 5.719036e-14
## JD_06232014_sample2_A.raw -0.80119110 -3.053138 0.808913 5.869350e-14
## JD_06232014_sample2_B.raw 1.38105058 -3.053447 2.280873 5.735907e-14
## JD_06232014_sample2_C.raw 0.08197253 6.577174 -1.924547 5.572777e-14
ggplot(aes(x=PC1, y=PC2), data=data.frame(pc$x))+
geom_point(size=4, alpha=0.5)+
theme_bw()
In order to make different colors or shapes by groups, create Create vector of disease labels variable called Condition.
colnames(input.wide)
## [1] "JD_06232014_sample1_B.raw" "JD_06232014_sample1_C.raw"
## [3] "JD_06232014_sample1-A.raw" "JD_06232014_sample2_A.raw"
## [5] "JD_06232014_sample2_B.raw" "JD_06232014_sample2_C.raw"
## [7] "JD_06232014_sample3_A.raw" "JD_06232014_sample3_B.raw"
## [9] "JD_06232014_sample3_C.raw" "JD_06232014_sample4_B.raw"
## [11] "JD_06232014_sample4_C.raw" "JD_06232014_sample4-A.raw"
Condition <- c(rep("Condition1", 3), rep("Condition2", 3),
rep("Condition3", 3), rep("Condition4", 3))
Condition
## [1] "Condition1" "Condition1" "Condition1" "Condition2" "Condition2"
## [6] "Condition2" "Condition3" "Condition3" "Condition3" "Condition4"
## [11] "Condition4" "Condition4"
# Create PC1 vs PC2 scatterplot with Condition colors
ggplot(aes(x=PC1, y=PC2, color=Group), data=data.frame(pc$x, Group=Condition))+
geom_point(size=4, alpha=0.5)+
theme_bw()
With more number proteins, we need more PC components to reach most of variation.
# check the class
class(input.wide.spike)
## [1] "data.frame"
# It is data.frame. Convert to numeric matrix
input.wide.spike <- as.matrix(input.wide.spike)
class(input.wide.spike)
## [1] "matrix"
# Visually no difference
head(input.wide.spike)
## JD_06232014_sample1_B.raw JD_06232014_sample1_C.raw
## sp|P44015|VAC2_YEAST 26.30163 26.11643
## sp|P44374|SFG2_YEAST 24.26559 24.27609
## sp|P44683|PGA4_YEAST 22.47239 22.56104
## sp|P44983|UTR6_YEAST 21.29173 21.38781
## sp|P55249|ZRT4_YEAST 21.86421 21.41435
## sp|P55752|ISCB_YEAST 27.47412 27.69489
## JD_06232014_sample1-A.raw JD_06232014_sample2_A.raw
## sp|P44015|VAC2_YEAST 26.29089 25.81957
## sp|P44374|SFG2_YEAST 24.23382 21.36784
## sp|P44683|PGA4_YEAST 22.52561 20.06196
## sp|P44983|UTR6_YEAST 21.31825 27.51134
## sp|P55249|ZRT4_YEAST 21.18725 27.93246
## sp|P55752|ISCB_YEAST 27.34912 24.59934
## JD_06232014_sample2_B.raw JD_06232014_sample2_C.raw
## sp|P44015|VAC2_YEAST 26.11527 26.08498
## sp|P44374|SFG2_YEAST 21.20330 21.19631
## sp|P44683|PGA4_YEAST 20.18183 20.72974
## sp|P44983|UTR6_YEAST 27.46338 27.33788
## sp|P55249|ZRT4_YEAST 27.75780 27.83651
## sp|P55752|ISCB_YEAST 24.62946 24.70403
## JD_06232014_sample3_A.raw JD_06232014_sample3_B.raw
## sp|P44015|VAC2_YEAST 23.14806 23.32465
## sp|P44374|SFG2_YEAST 26.90000 26.85056
## sp|P44683|PGA4_YEAST 21.72482 21.93912
## sp|P44983|UTR6_YEAST 26.94886 26.92480
## sp|P55249|ZRT4_YEAST 21.04481 21.00220
## sp|P55752|ISCB_YEAST 21.37264 20.27802
## JD_06232014_sample3_C.raw JD_06232014_sample4_B.raw
## sp|P44015|VAC2_YEAST 23.29555 20.94536
## sp|P44374|SFG2_YEAST 26.90190 26.61875
## sp|P44683|PGA4_YEAST 21.96229 29.20591
## sp|P44983|UTR6_YEAST 26.85774 23.86204
## sp|P55249|ZRT4_YEAST 21.02761 20.71864
## sp|P55752|ISCB_YEAST 20.44112 27.36941
## JD_06232014_sample4_C.raw JD_06232014_sample4-A.raw
## sp|P44015|VAC2_YEAST 21.71424 20.25209
## sp|P44374|SFG2_YEAST 26.60961 26.69464
## sp|P44683|PGA4_YEAST 29.12515 29.21482
## sp|P44983|UTR6_YEAST 23.76070 24.01492
## sp|P55249|ZRT4_YEAST 20.69131 20.37886
## sp|P55752|ISCB_YEAST 27.53762 27.34758
heatmap function in base stats packageFirst, let’s try to draw heatmap with base function.
heatmap(input.wide.spike, scale='none')
heatmap(input.wide.spike, scale='row')
heatmap(input.wide.spike, scale='column')
# Change the font of row and column label
heatmap(input.wide.spike, cexRow = 0.6, cexCol = 0.6, margins = c(2, 2))
# Don't do cluster on rows
heatmap(input.wide.spike, cexRow = 0.6, cexCol = 0.6, margins = c(2, 2), Rowv = NA)
# Don't do cluster on columns
heatmap(input.wide.spike, cexRow = 0.6, cexCol = 0.6, margins = c(2, 2), Colv = NA)
Add color side bar at th top of columns to distinguish group information by run.
group.color <- c(rep("red", 3), rep("orange", 3),
rep("yellow", 3), rep("blue", 3))
heatmap(input.wide.spike, ColSideColors=group.color)
If color scale changed, it is easy to see the difference.
library(marray)
## Loading required package: limma
my.colors <- c(maPalette(low = "darkblue", high = "white", k = 7)[-7],
"white",maPalette(low = "white", high = "darkred", k = 7)[-1])
heatmap(input.wide.spike,
col=my.colors,
ColSideColors=group.color,
scale='none')
Try different distances calculation and clustering methods. Choice of distance metric or clustering matters!
Distance options: euclidean (default), maximum, canberra, binary, minkowski, manhattan
Cluster options: complete (default), single, average, mcquitty, median, centroid, ward
# can change method for distance calculation
col_distance <- dist(t(input.wide.spike), method = "euclidean")
# can change clustering method
col_cluster <- hclust(col_distance, method = "ward.D")
heatmap(input.wide.spike,
col=my.colors,
ColSideColors=group.color,
Colv = as.dendrogram(col_cluster),
scale='none')
# check the class
class(input.wide)
## [1] "data.frame"
# It is data.frame. Convert to numeric matrix
input.wide <- as.matrix(input.wide)
class(input.wide)
## [1] "matrix"
# Visually no difference
head(input.wide)
## JD_06232014_sample1_B.raw JD_06232014_sample1_C.raw
## sp|D6VTK4|STE2_YEAST 26.81232 26.60786
## sp|O13297|CET1_YEAST 24.71912 24.67437
## sp|O13329|FOB1_YEAST 23.37678 24.01464
## sp|O13539|THP2_YEAST 27.52021 27.38510
## sp|O13547|CCW14_YEAST 27.22234 26.80454
## sp|O13563|RPN13_YEAST 26.09476 26.29253
## JD_06232014_sample1-A.raw JD_06232014_sample2_A.raw
## sp|D6VTK4|STE2_YEAST 26.58301 26.83563
## sp|O13297|CET1_YEAST 24.71809 24.52341
## sp|O13329|FOB1_YEAST 23.47075 23.17165
## sp|O13539|THP2_YEAST 24.29661 25.88066
## sp|O13547|CCW14_YEAST 27.11638 27.17386
## sp|O13563|RPN13_YEAST 26.17056 25.90882
## JD_06232014_sample2_B.raw JD_06232014_sample2_C.raw
## sp|D6VTK4|STE2_YEAST 26.79430 26.60863
## sp|O13297|CET1_YEAST 24.67593 24.57865
## sp|O13329|FOB1_YEAST 23.43427 23.76997
## sp|O13539|THP2_YEAST 26.10254 25.90646
## sp|O13547|CCW14_YEAST 26.91302 26.88314
## sp|O13563|RPN13_YEAST 26.12989 26.01078
## JD_06232014_sample3_A.raw JD_06232014_sample3_B.raw
## sp|D6VTK4|STE2_YEAST 26.62966 26.49626
## sp|O13297|CET1_YEAST 24.38955 24.62652
## sp|O13329|FOB1_YEAST 24.20100 23.28643
## sp|O13539|THP2_YEAST 25.65002 26.33853
## sp|O13547|CCW14_YEAST 26.56416 26.75541
## sp|O13563|RPN13_YEAST 26.17791 26.03753
## JD_06232014_sample3_C.raw JD_06232014_sample4_B.raw
## sp|D6VTK4|STE2_YEAST 26.53029 26.60612
## sp|O13297|CET1_YEAST 24.66197 24.64886
## sp|O13329|FOB1_YEAST 23.73741 23.16646
## sp|O13539|THP2_YEAST 25.91799 26.70101
## sp|O13547|CCW14_YEAST 26.95027 26.98082
## sp|O13563|RPN13_YEAST 26.11412 26.07538
## JD_06232014_sample4_C.raw JD_06232014_sample4-A.raw
## sp|D6VTK4|STE2_YEAST 26.38611 26.65573
## sp|O13297|CET1_YEAST 24.66559 24.50814
## sp|O13329|FOB1_YEAST 23.60780 23.03473
## sp|O13539|THP2_YEAST 25.91781 25.07576
## sp|O13547|CCW14_YEAST 26.80274 27.07526
## sp|O13563|RPN13_YEAST 26.05415 25.77958
Without scale in rows, it is hard to see the difference between groups per protein.
group.color <- c(rep("red", 3), rep("orange", 3),
rep("yellow", 3), rep("blue", 3))
# remove row information (Protein IDs)
heatmap(input.wide,
ColSideColors=group.color,
col=my.colors,
labRow = FALSE,
scale = 'none')
Protein quantification will be centered and scaled in the row direction.
# no Protein IDs
heatmap(input.wide,
ColSideColors=group.color,
col=my.colors,
labRow = FALSE,
scale = 'row')
If we change distance calculation and/or clustering method, dendrogram is different. In this case, clustering between methods are not much different.
# can change method for distance calculation
col_distance <- dist(t(input.wide), method = "euclidean")
# can change clustering method
col_cluster <- hclust(col_distance, method = "ward.D")
heatmap(input.wide,
col=my.colors,
ColSideColors=group.color,
Colv = as.dendrogram(col_cluster),
labRow = FALSE,
scale = 'row')
In this section we’ll continue using mpg and CRC dataset.
crc <- read.csv("./data_day3/CRC_train.csv")
head(crc)
## A1AG2 AFM AHSG AIAG.Bovine ANT3 AOC3 APOB
## 1 14.23816 16.10302 19.95179 15.25354 17.20794 10.033227 15.54477
## 2 15.02411 16.02071 19.71592 15.15455 17.29790 9.035202 15.13188
## 3 15.63136 16.14380 19.71085 15.59163 17.59625 10.382938 15.95530
## 4 15.40137 16.27642 19.70438 15.11819 17.42250 9.504018 15.71493
## 5 16.00316 16.95821 20.42033 15.58249 17.98820 9.651676 16.24733
## 6 13.93242 16.52772 19.88985 13.37131 16.32493 9.521379 14.51862
## ATRN BTD C20orf3 CADM1 CD163 CD44 CDH5
## 1 14.38339 16.28307 10.660954 9.743511 10.94965 9.943858 8.749720
## 2 13.98172 16.24919 10.702064 9.702175 10.83449 10.425750 9.056710
## 3 14.63536 16.49916 11.188267 10.373824 11.22385 11.026696 9.477187
## 4 14.06070 16.27773 9.966157 10.105507 11.17723 11.103752 9.866905
## 5 14.20359 16.54142 12.574731 9.690320 12.23111 11.166600 8.522353
## 6 13.70165 15.79196 10.578714 9.190440 12.21289 8.038643 9.225166
## CFH CFI CLU CP CTSD DKFZp686N02209 DSG2
## 1 17.26186 16.52607 19.32642 16.99737 9.275675 20.70295 10.22517
## 2 17.26762 16.86256 19.40444 16.22411 10.865768 21.86027 10.04611
## 3 17.98868 17.20551 19.53746 17.75641 10.029862 21.84249 11.11479
## 4 18.17423 17.52567 19.38313 18.05366 10.203826 20.70488 10.95813
## 5 18.82220 18.47344 20.19405 16.23669 10.941220 21.18504 10.87991
## 6 16.13991 16.94726 18.90905 16.34736 11.858181 21.77231 10.23335
## ECM1 F11 F5 FCGBP FETUA.Bovine FETUB FGA
## 1 13.113391 14.81509 12.23214 12.38189 17.04865 13.29704 8.144845
## 2 12.569733 14.71162 12.14809 11.68763 17.07832 13.50235 9.305395
## 3 13.661017 15.56122 12.80789 12.40291 17.12503 13.55688 9.432872
## 4 13.545199 15.95389 13.18926 13.03992 17.06354 13.12742 9.746036
## 5 13.371014 15.98843 12.54162 13.23842 17.02016 14.95305 10.066428
## 6 9.915183 14.69724 11.62907 12.69986 17.08611 13.19051 NA
## FGG FHR3 FN1 GOLM1 HP HRG HYOU1 ICAM1
## 1 11.34954 11.52154 12.55169 8.529159 19.74767 17.47524 7.504040 9.090639
## 2 12.19192 10.73714 13.47176 NA 20.12543 17.95990 9.764873 9.253566
## 3 11.49952 11.56901 12.36536 NA 20.78502 17.57534 8.646144 9.369628
## 4 11.87189 13.40320 12.17770 NA 21.87358 17.38916 8.868061 9.760900
## 5 11.59058 12.56019 14.10296 9.749398 22.79789 17.95183 8.115907 9.887392
## 6 10.68147 10.05570 11.81432 9.913008 20.71284 16.76395 8.842497 9.027833
## IGFBP3 IGHA2 IGHG2 ITIH4 KLKB1 KNG1 LAMP2
## 1 10.918418 18.90573 22.10597 15.64604 14.12489 17.65508 13.68160
## 2 10.357056 21.04688 22.44804 15.36709 14.19172 16.98601 13.48692
## 3 11.619532 18.87877 22.95173 16.40141 14.09215 17.39206 14.12212
## 4 11.425023 20.89932 22.93422 16.18817 13.48183 17.82389 14.41111
## 5 10.175458 21.26317 21.26145 15.87709 15.19997 17.95218 15.17730
## 6 9.527139 21.92661 21.00712 14.94386 13.20866 17.39358 13.94826
## LCN2 LGALS3BP LRG1 LUM LYVE1 MMRN1 MPO
## 1 9.446541 14.82147 13.96127 14.99193 11.86759 11.39385 9.729238
## 2 10.623488 15.12812 14.08645 15.41582 11.71390 11.15791 10.202602
## 3 10.024225 14.20939 15.05311 15.06369 12.19818 11.73116 9.782391
## 4 10.109405 14.87094 14.92352 14.60999 11.77828 11.95025 10.761889
## 5 11.052435 16.55743 16.46953 15.93393 12.92101 12.33475 11.727629
## 6 9.355121 16.28340 14.49637 14.47633 12.45132 11.68845 9.834259
## MRC2 MST1 NCAM1 ORM1 PGCP PIGR PLTP
## 1 10.86842 12.96116 9.905815 16.74179 8.379739 9.744116 11.85560
## 2 10.01434 13.09922 10.496499 17.24994 8.406487 9.422932 11.54273
## 3 10.56038 14.11266 10.521723 18.41524 8.243928 9.306936 12.15338
## 4 10.02103 13.92377 10.385096 18.09024 9.114961 9.417459 11.19101
## 5 11.41932 13.10718 10.517894 18.28100 8.778907 11.389195 12.12600
## 6 10.91711 13.37761 9.854809 17.02724 8.368942 9.466000 11.35824
## PLXDC2 PON1 PRG4 PROC PTPRJ Q5JNX2 SERPINA1
## 1 10.050261 16.87047 11.84463 11.109446 9.758946 19.02813 18.06109
## 2 9.974399 16.55428 12.49583 11.111383 9.934520 19.86254 17.61019
## 3 9.812975 16.34508 11.70096 11.533164 10.328319 19.50177 18.53997
## 4 9.847176 15.46083 11.09706 11.693945 9.741059 19.98466 17.40141
## 5 10.414638 16.46015 14.46691 10.699745 10.568064 20.46745 17.44801
## 6 8.387396 14.47485 11.85023 9.470798 9.922271 18.14204 17.95217
## SERPINA3 SERPINA6 SERPINA7 THBS1 TIMP1 TNC VTN VWF
## 1 14.21804 15.79685 13.58185 13.99569 11.57875 10.275261 12.06683 10.66103
## 2 14.83233 15.81540 13.13845 13.70771 11.96097 10.234434 12.83689 10.77843
## 3 15.30873 16.40177 13.74233 15.63696 12.15584 10.241626 12.31163 11.18527
## 4 14.25424 16.31049 13.93689 15.51727 12.37927 9.269195 10.67764 10.88786
## 5 16.23191 16.48583 14.16713 15.12911 12.16912 10.476927 14.50459 11.25418
## 6 12.60729 15.29659 13.23403 15.39790 12.52729 10.026299 11.52916 11.03501
## Sample Group Age Gender Cancer_stage Tumour_location Sub_group
## 1 P1A10 CRC 60 female 1 colon CRC
## 2 P1A2 CRC 70 male 1 rectum CRC
## 3 P1A4 CRC 65 male 1 rectum CRC
## 4 P1A6 CRC 65 female 4 colon CRC
## 5 P1B12 CRC 62 female 3 colon CRC
## 6 P1B2 CRC 55 male 2 colon CRC
colnames(crc)
## [1] "A1AG2" "AFM" "AHSG"
## [4] "AIAG.Bovine" "ANT3" "AOC3"
## [7] "APOB" "ATRN" "BTD"
## [10] "C20orf3" "CADM1" "CD163"
## [13] "CD44" "CDH5" "CFH"
## [16] "CFI" "CLU" "CP"
## [19] "CTSD" "DKFZp686N02209" "DSG2"
## [22] "ECM1" "F11" "F5"
## [25] "FCGBP" "FETUA.Bovine" "FETUB"
## [28] "FGA" "FGG" "FHR3"
## [31] "FN1" "GOLM1" "HP"
## [34] "HRG" "HYOU1" "ICAM1"
## [37] "IGFBP3" "IGHA2" "IGHG2"
## [40] "ITIH4" "KLKB1" "KNG1"
## [43] "LAMP2" "LCN2" "LGALS3BP"
## [46] "LRG1" "LUM" "LYVE1"
## [49] "MMRN1" "MPO" "MRC2"
## [52] "MST1" "NCAM1" "ORM1"
## [55] "PGCP" "PIGR" "PLTP"
## [58] "PLXDC2" "PON1" "PRG4"
## [61] "PROC" "PTPRJ" "Q5JNX2"
## [64] "SERPINA1" "SERPINA3" "SERPINA6"
## [67] "SERPINA7" "THBS1" "TIMP1"
## [70] "TNC" "VTN" "VWF"
## [73] "Sample" "Group" "Age"
## [76] "Gender" "Cancer_stage" "Tumour_location"
## [79] "Sub_group"
dim(crc)
## [1] 200 79
crc.prot <- crc[,1:72]
crc.anno <- crc[,73:79]
## check missing value
sum(is.na(crc.prot))
## [1] 215
# Deal with missing value
# First option: remove the samples with missing values
dim(na.omit(crc.prot))
## [1] 89 72
# Second option: impute the missing values
random.imp <- function (a){
missing <- is.na(a)
n.missing <- sum(missing)
a.obs <- a[!missing]
imputed <- a
# imputed[missing] <- 0
# imputed[missing] <- sample(a.obs, n.missing, replace=TRUE)
# imputed[missing] <- median(a.obs)
imputed[missing] <- min(a.obs)
return (imputed)
}
pMiss <- function(x){
sum(is.na(x))/length(x)*100
}
# 1. Only keep the proteins with less than 5% missing values
proteinmissing <- apply(crc.prot, 2, pMiss)
selectedprotein <- which(proteinmissing <= 5)
crc.prot <- crc.prot[, selectedprotein]
# 2. Only keep the samples with less than 5% missing values
samplemissing <- apply(crc.prot, 1, pMiss)
selectedSamples <- which(samplemissing <= 5)
# Then, impute the missing value
imputed.crc.prot <- t(apply(crc.prot[selectedSamples,], 1, function(x) random.imp(x)))
imputed.crc.anno <- crc.anno[selectedSamples, ]
dim(imputed.crc.anno)
## [1] 200 7
library(genefilter)
## heatmap function in base stats package
heatmap(t(imputed.crc.prot))
# Non-scaled
heatmap(t(imputed.crc.prot), cexRow = 0.3, cexCol = 0.2, margins = c(2, 2), Colv = NA, scale='none')
# scaled
heatmap(t(imputed.crc.prot), cexRow = 0.3, cexCol = 0.2, margins = c(2, 2), Colv = NA)
heatmap(t(imputed.crc.prot), cexRow = 0.3, cexCol = 0.2, margins = c(2, 2), Colv = NA,
col=my.colors)
unique(imputed.crc.anno$Sub_group)
## [1] CRC Benign Healthy
## Levels: Benign CRC Healthy
myColor <- rep("blue", nrow(imputed.crc.prot))
myColor[imputed.crc.anno$Sub_group == "CRC"] <- "red"
myColor[imputed.crc.anno$Sub_group == "Benign"] <- "yellow"
quantile-spaced breaks in positive and negative directions
# change color code: protein-standardized expressions
# quantile-spaced breaks in positive and negative directions
imputed_CRC_prot_centerScale <- ( t(imputed.crc.prot) - rowMeans(t(imputed.crc.prot)) ) / rowSds(t(imputed.crc.prot))
exprs_brk <- c( quantile(imputed_CRC_prot_centerScale[imputed_CRC_prot_centerScale < 0], probs=seq(0, 1, length=10)), 0, quantile(imputed_CRC_prot_centerScale[imputed_CRC_prot_centerScale > 0], seq(0, 1, length=10)))
colors <- colorRampPalette(c("darkblue", "white", "darkred"))(length(exprs_brk)-1)
heatmap(t(imputed.crc.prot),
cexRow = 0.3, cexCol = 0.2, margins = c(2, 2), Rowv = NA,
col = colors, breaks = exprs_brk, ColSideColors=myColor)
# can change method for distance calculation
col_distance <- dist(imputed.crc.prot, method = "euclidean")
# can change clustering method
col_cluster <- hclust(col_distance, method = "ward.D")
heatmap(t(imputed.crc.prot),
cexRow = 0.3, cexCol = 0.2, margins = c(2, 2),
col = colors,
breaks = exprs_brk,
ColSideColors = myColor,
Rowv = NA,
Colv = as.dendrogram(col_cluster))